import difflib
import pandas as pd

def str_similar(s1, s2):
    """Return a fast similarity score between two sequences.

    The score is ``difflib.SequenceMatcher.quick_ratio()``, which the
    standard library documents as an upper bound on ``ratio()``. It is
    cheaper to compute than ``ratio()``.

    Args:
        s1: First sequence (typically a string).
        s2: Second sequence (typically a string).

    Returns:
        A float in the range [0, 1]; 1.0 for two empty sequences.
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=s1, b=s2)
    return matcher.quick_ratio()

def synonym(s_rate=0.5, excel_path='nodes_relation.xlsx',
            csv_path='synonym.csv'):
    """Find similarly named fields and export them to a CSV file.

    Reads the ``nodes`` sheet of the Excel workbook, keeps the rows whose
    ``字段中文名入库ES`` column equals ``'是'``, and for every remaining row
    collects the ``(表中文名, 字段中文名)`` pairs of all *other* rows whose
    ``字段中文名`` value scores above ``s_rate`` with ``str_similar``. The
    collected pairs are stored in a new ``相似字段`` column and the frame is
    written to ``csv_path``.

    Args:
        s_rate: Similarity threshold; a pair is kept only when its score is
            strictly greater than this value.
        excel_path: Workbook to read. Defaults to the original hard-coded
            ``'nodes_relation.xlsx'``.
        csv_path: CSV file to write. Defaults to the original hard-coded
            ``'synonym.csv'``.

    Returns:
        None. The result is written to ``csv_path``.
    """
    data = pd.read_excel(excel_path, sheet_name='nodes')

    # Select rows and columns in one .loc call and take an explicit copy so
    # that adding a column below writes to an independent frame instead of
    # a chained-indexing result (avoids pandas' SettingWithCopyWarning).
    selected = data['字段中文名入库ES'] == '是'
    df = data.loc[selected, ['表中文名', '字段中文名']].copy()

    pairs = list(zip(df['表中文名'], df['字段中文名']))

    # For each row, gather every other row whose field name is similar.
    # The comparison is positional (j != i), so rows with identical names
    # still match one another; only the row itself is excluded.
    similar = [
        [
            other
            for j, other in enumerate(pairs)
            if j != i and str_similar(field, other[1]) > s_rate
        ]
        for i, (_, field) in enumerate(pairs)
    ]

    df['相似字段'] = similar
    # utf_8_sig writes a BOM at the start of the file.
    df.to_csv(csv_path, encoding='utf_8_sig')


# Run the export with the default similarity threshold when this file is
# executed as a script (not when it is imported as a module).
if __name__ == '__main__':
    synonym()